In [1]:
import pandas as pd
import numpy as np
from matplotlib import gridspec
import matplotlib.pyplot as plt
from rdkit import Chem, DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import Draw
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.express as px
import umap

import ibis
# Make DuckDB the default backend used by top-level helpers such as
# ibis.read_parquet().
ibis.set_backend("duckdb")
# Interactive mode: an expression left as the last line of a cell is evaluated
# and rendered as a preview table instead of showing the unevaluated expression.
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
# Suppress every warning raised from this point on. This keeps the notebook
# output tidy but also hides deprecation notices; it does not affect warnings
# already emitted by the imports above.
warnings.filterwarnings('ignore')
/Users/jordanramsdell/mambaforge/envs/ml_ibis/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Goal¶

  • We are going to explore some of the Open Targets datasets (which you can download from the Open Targets Platform downloads page)
  • We'll demonstrate some useful features of Ibis in terms of dealing with deeply nested data
  • We'll try to embed a subset of molecules (from the Open Targets molecule dataset) to see how they cluster according to similarity

Example: reading parquet files (using Ibis) into tables¶

In [2]:
# Load the Open Targets "targets" parquet directory as an Ibis table expression
# (path is relative to the notebook's working directory).
t_targets = ibis.read_parquet("../../../data/open_targets/targets/")
# Bare expression as the last line of the cell: rendered as a preview of the
# table, including each column's type.
t_targets
Out[2]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ approvedSymbol ┃ biotype              ┃ transcriptIds                                   ┃ canonicalTranscript                                                              ┃ canonicalExons                      ┃ genomicLocation                                                     ┃ alternativeGenes    ┃ approvedName                                            ┃ go                                                                               ┃ hallmarks ┃ synonyms                                     ┃ symbolSynonyms                               ┃ nameSynonyms                                 ┃ functionDescriptions                                                                                                                                                             ┃ subcellularLocations                                                             ┃ targetClass                                            ┃ obsoleteSymbols                              ┃ obsoleteNames                                ┃ constraint                                                                       ┃ tep  ┃ proteinIds                                ┃ dbXrefs                                   ┃ chemicalProbes ┃ homologues                                                                       ┃ tractability                                                ┃ safetyLiabilities                                                                ┃ pathways                                                                ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ string         │ string               │ array<string>                                   │ struct<id: string, chromosome: string, start: int64, end: int64, strand: string> │ array<string>                       │ struct<chromosome: string, start: int64, end: int64, strand: int32> │ array<string>       │ string                                                  │ array<struct<id: string, source: string, evidence: string, aspect: string, gene… │ struct<a… │ array<struct<label: string, source: string>> │ array<struct<label: string, source: string>> │ array<struct<label: string, source: string>> │ array<string>                                                                                                                                                                    │ array<struct<location: string, source: string, termSL: string, labelSL: string>> │ array<struct<id: int64, label: string, level: string>> │ array<struct<label: string, source: string>> │ array<struct<label: string, source: string>> │ array<struct<constraintType: string, score: float32, exp: float32, obs: int32, … │ str… │ array<struct<id: string, source: string>> │ array<struct<id: string, source: string>> │ array<struct<… │ array<struct<speciesId: string, speciesName: string, homologyType: string, targ… │ array<struct<modality: string, id: string, value: boolean>> │ array<struct<event: string, eventId: string, effects: array<struct<direction: s… │ array<struct<pathwayId: string, pathway: string, topLevelTerm: string>> │
├─────────────────┼────────────────┼──────────────────────┼─────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────┼─────────────────────────────────────────────────────────────────────┼─────────────────────┼─────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼───────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────┼───────────────────────────────────────────┼───────────────────────────────────────────┼────────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────────────────────────────┤
│ ENSG00000020219 │ CCT8L1P        │ processed_pseudogene │ ['ENST00000465400']                             │ {'id': 'ENST00000465400', 'chromosome': '7', ... +3}                             │ ['152445477', '152447150']          │ {'chromosome': '7', 'start': 152445477, ... +2}                     │ NULL                │ chaperonin containing TCP1 subunit 8 like 1, pseudogene │ NULL                                                                             │ NULL      │ [{...}, {...}, ... +4]                       │ [{...}, {...}, ... +2]                       │ [{...}, {...}]                               │ ['Possible molecular chaperone; assists the folding of proteins upon ATP hydrolysi'+4]                                                                                           │ [{...}]                                                                          │ NULL                                                   │ [{...}]                                      │ [{...}, {...}]                               │ NULL                                                                             │ NULL │ [{...}]                                   │ [{...}, {...}, ... +4]                    │ NULL           │ NULL                                                                             │ NULL                                                        │ NULL                                                                             │ NULL                                                                    │
│ ENSG00000059588 │ TARBP1         │ protein_coding       │ ['ENST00000496673', 'ENST00000483404', ... +7]  │ {'id': 'ENST00000040877', 'chromosome': '1', ... +3}                             │ ['234420702', '234420812', ... +58] │ {'chromosome': '1', 'start': 234391313, ... +2}                     │ NULL                │ TAR (HIV-1) RNA binding protein 1                       │ [{...}, {...}, ... +3]                                                           │ NULL      │ [{...}, {...}, ... +20]                      │ [{...}, {...}, ... +10]                      │ [{...}, {...}, ... +8]                       │ ['Probable S-adenosyl-L-methionine-dependent methyltransferase which methylates RN'+28, '(Microbial infection) In case of infection by HIV-1, it binds to the loop region'+444]  │ [{...}]                                                                          │ [{...}]                                                │ []                                           │ [{...}]                                      │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}]                            │ [{...}, {...}, ... +8]                    │ NULL           │ [{...}, {...}, ... +11]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ NULL                                                                    │
│ ENSG00000070182 │ SPTB           │ protein_coding       │ ['ENST00000553938', 'ENST00000389720', ... +5]  │ {'id': 'ENST00000644917', 'chromosome': '14', ... +3}                            │ ['64785537', '64785627', ... +70]   │ {'chromosome': '14', 'start': 64746283, ... +2}                     │ NULL                │ spectrin beta, erythrocytic                             │ [{...}, {...}, ... +43]                                                          │ NULL      │ [{...}, {...}, ... +15]                      │ [{...}, {...}, ... +5]                       │ [{...}, {...}, ... +8]                       │ ['Spectrin is the major constituent of the cytoskeletal network underlying the ery'+139]                                                                                         │ [{...}, {...}]                                                                   │ NULL                                                   │ []                                           │ []                                           │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +2]                    │ [{...}, {...}, ... +15]                   │ NULL           │ [{...}, {...}, ... +22]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ [{...}, {...}, ... +2]                                                  │
│ ENSG00000070366 │ SMG6           │ protein_coding       │ ['ENST00000354901', 'ENST00000570756', ... +18] │ {'id': 'ENST00000263073', 'chromosome': '17', ... +3}                            │ ['2172658', '2172859', ... +36]     │ {'chromosome': '17', 'start': 2059839, ... +2}                      │ NULL                │ SMG6 nonsense mediated mRNA decay factor                │ [{...}, {...}, ... +47]                                                          │ NULL      │ [{...}, {...}, ... +23]                      │ [{...}, {...}, ... +11]                      │ [{...}, {...}, ... +10]                      │ ['Component of the telomerase ribonucleoprotein (RNP) complex that is essential fo'+685, 'Plays a role in nonsense-mediated mRNA decay (PubMed:18974281, PubMed:19060897, '+586] │ [{...}, {...}, ... +3]                                                           │ NULL                                                   │ [{...}]                                      │ [{...}, {...}, ... +2]                       │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +12]                   │ [{...}, {...}, ... +13]                   │ NULL           │ [{...}, {...}, ... +10]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ [{...}]                                                                 │
│ ENSG00000072071 │ ADGRL1         │ protein_coding       │ ['ENST00000361434', 'ENST00000589616', ... +6]  │ {'id': 'ENST00000361434', 'chromosome': '19', ... +3}                            │ ['14160112', '14160297', ... +44]   │ {'chromosome': '19', 'start': 14147743, ... +2}                     │ ['ENSG00000288324'] │ adhesion G protein-coupled receptor L1                  │ [{...}, {...}, ... +21]                                                          │ NULL      │ [{...}, {...}, ... +20]                      │ [{...}, {...}, ... +11]                      │ [{...}, {...}, ... +7]                       │ ['Calcium-independent receptor of high affinity for alpha- latrotoxin, an excitato'+308]                                                                                         │ [{...}, {...}, ... +2]                                                           │ NULL                                                   │ [{...}]                                      │ [{...}]                                      │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +5]                    │ [{...}, {...}, ... +14]                   │ NULL           │ [{...}, {...}, ... +18]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ NULL                                                                    │
│ ENSG00000073536 │ NLE1           │ protein_coding       │ ['ENST00000589367', 'ENST00000360831', ... +5]  │ {'id': 'ENST00000442241', 'chromosome': '17', ... +3}                            │ ['35133339', '35133498', ... +24]   │ {'chromosome': '17', 'start': 35128730, ... +2}                     │ NULL                │ notchless homolog 1                                     │ [{...}, {...}, ... +12]                                                          │ NULL      │ [{...}, {...}, ... +7]                       │ [{...}, {...}, ... +3]                       │ [{...}, {...}, ... +2]                       │ ['Plays a role in regulating Notch activity. Plays a role in regulating the expres'+176]                                                                                         │ [{...}, {...}, ... +1]                                                           │ NULL                                                   │ [{...}]                                      │ [{...}]                                      │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +6]                    │ [{...}, {...}, ... +8]                    │ NULL           │ [{...}, {...}, ... +10]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ NULL                                                                    │
│ ENSG00000075290 │ WNT8B          │ protein_coding       │ ['ENST00000343737']                             │ {'id': 'ENST00000343737', 'chromosome': '10', ... +3}                            │ ['100479874', '100480012', ... +10] │ {'chromosome': '10', 'start': 100463009, ... +2}                    │ NULL                │ Wnt family member 8B                                    │ [{...}, {...}, ... +18]                                                          │ NULL      │ [{...}, {...}, ... +4]                       │ [{...}, {...}]                               │ [{...}, {...}, ... +2]                       │ ['Ligand for members of the frizzled family of seven transmembrane receptors. May '+121]                                                                                         │ [{...}]                                                                          │ NULL                                                   │ []                                           │ [{...}]                                      │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +4]                    │ [{...}, {...}, ... +9]                    │ NULL           │ [{...}, {...}, ... +14]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ [{...}, {...}, ... +2]                                                  │
│ ENSG00000083454 │ P2RX5          │ protein_coding       │ ['ENST00000552276', 'ENST00000551178', ... +10] │ {'id': 'ENST00000225328', 'chromosome': '17', ... +3}                            │ ['3695869', '3696155', ... +22]     │ {'chromosome': '17', 'start': 3672199, ... +2}                      │ NULL                │ purinergic receptor P2X 5                               │ [{...}, {...}, ... +18]                                                          │ NULL      │ [{...}, {...}, ... +14]                      │ [{...}, {...}, ... +6]                       │ [{...}, {...}, ... +6]                       │ ['Receptor for ATP that acts as a ligand-gated ion channel.']                                                                                                                    │ [{...}, {...}]                                                                   │ [{...}, {...}, ... +1]                                 │ []                                           │ [{...}]                                      │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +11]                   │ [{...}, {...}, ... +6]                    │ NULL           │ [{...}, {...}, ... +11]                                                          │ [{...}, {...}, ... +26]                                     │ [{...}, {...}, ... +6]                                                           │ [{...}, {...}]                                                          │
│ ENSG00000083782 │ EPYC           │ protein_coding       │ ['ENST00000261172', 'ENST00000551767', ... +1]  │ {'id': 'ENST00000261172', 'chromosome': '12', ... +3}                            │ ['90970044', '90970139', ... +12]   │ {'chromosome': '12', 'start': 90963682, ... +2}                     │ NULL                │ epiphycan                                               │ [{...}, {...}, ... +5]                                                           │ NULL      │ [{...}, {...}, ... +20]                      │ [{...}, {...}, ... +10]                      │ [{...}, {...}, ... +8]                       │ ['May have a role in bone formation and also in establishing the ordered structure'+42]                                                                                          │ [{...}]                                                                          │ NULL                                                   │ [{...}]                                      │ [{...}, {...}]                               │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +3]                    │ [{...}, {...}, ... +6]                    │ NULL           │ [{...}, {...}, ... +13]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ NULL                                                                    │
│ ENSG00000086200 │ IPO11          │ protein_coding       │ ['ENST00000409296', 'ENST00000506905', ... +12] │ {'id': 'ENST00000325324', 'chromosome': '5', ... +3}                             │ ['62442983', '62443083', ... +58]   │ {'chromosome': '5', 'start': 62403972, ... +2}                      │ NULL                │ importin 11                                             │ [{...}, {...}, ... +7]                                                           │ NULL      │ [{...}, {...}, ... +12]                      │ [{...}, {...}, ... +6]                       │ [{...}, {...}, ... +4]                       │ ['Functions in nuclear protein import as nuclear transport receptor. Serves as rec'+807]                                                                                         │ [{...}, {...}, ... +2]                                                           │ NULL                                                   │ []                                           │ []                                           │ [{...}, {...}, ... +1]                                                           │ NULL │ [{...}, {...}, ... +12]                   │ [{...}, {...}, ... +4]                    │ NULL           │ [{...}, {...}, ... +14]                                                          │ [{...}, {...}, ... +26]                                     │ NULL                                                                             │ NULL                                                                    │
│ …               │ …              │ …                    │ …                                               │ …                                                                                │ …                                   │ …                                                                   │ …                   │ …                                                       │ …                                                                                │ …         │ …                                            │ …                                            │ …                                            │ …                                                                                                                                                                                │ …                                                                                │ …                                                      │ …                                            │ …                                            │ …                                                                                │ …    │ …                                         │ …                                         │ …              │ …                                                                                │ …                                                           │ …                                                                                │ …                                                                       │
└─────────────────┴────────────────┴──────────────────────┴─────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────┴─────────────────────────────────────────────────────────────────────┴─────────────────────┴─────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴───────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────┴───────────────────────────────────────────┴───────────────────────────────────────────┴────────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────┘

Example: dealing with nested data in tables¶

  • The subcellularLocations column is deeply nested: each row holds an array of structs
In [3]:
# Look at the nested column on its own; its type is
# array<struct<location, source, termSL, labelSL>> (all string fields).
t_targets.subcellularLocations
Out[3]:
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ subcellularLocations                                                             ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ array<struct<location: string, source: string, termSL: string, labelSL: string>> │
├──────────────────────────────────────────────────────────────────────────────────┤
│ [{...}]                                                                          │
│ [{...}]                                                                          │
│ [{...}, {...}]                                                                   │
│ [{...}, {...}, ... +3]                                                           │
│ [{...}, {...}, ... +2]                                                           │
│ [{...}, {...}, ... +1]                                                           │
│ [{...}]                                                                          │
│ [{...}, {...}]                                                                   │
│ [{...}]                                                                          │
│ [{...}, {...}, ... +2]                                                           │
│ …                                                                                │
└──────────────────────────────────────────────────────────────────────────────────┘
  • Here we unnest subcellularLocations and pair it with target IDs
  • We will get a new table where for each struct in an array, a new row is created
In [4]:
# Pair each target id with the individual elements of its subcellularLocations
# array: unnest() yields one output row per array element, so an id repeats
# once for every location struct it has.
(t_targets
 # "_" is Ibis's deferred placeholder: it stands for the table that select()
 # is called on (here t_targets), so the column can be referenced inside a
 # method chain without naming the table again.
 .select("id", _.subcellularLocations.unnest())
)
Out[4]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ subcellularLocations                                                      ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ struct<location: string, source: string, termSL: string, labelSL: string> │
├─────────────────┼───────────────────────────────────────────────────────────────────────────┤
│ ENSG00000020219 │ {'location': 'Cytoplasm', 'source': 'uniprot', ... +2}                    │
│ ENSG00000059588 │ {'location': 'Nuclear speckles', 'source': 'HPA_main', ... +2}            │
│ ENSG00000070182 │ {'location': 'Cytoplasm', 'source': 'uniprot', ... +2}                    │
│ ENSG00000070182 │ {'location': 'Cytosol', 'source': 'HPA_main', ... +2}                     │
│ ENSG00000070366 │ {'location': 'Nucleus', 'source': 'uniprot', ... +2}                      │
│ ENSG00000070366 │ {'location': 'Chromosome', 'source': 'uniprot', ... +2}                   │
│ ENSG00000070366 │ {'location': 'Cytoplasm', 'source': 'uniprot', ... +2}                    │
│ ENSG00000070366 │ {'location': 'Nucleoli', 'source': 'HPA_main', ... +2}                    │
│ ENSG00000070366 │ {'location': 'Cytosol', 'source': 'HPA_additional', ... +2}               │
│ ENSG00000072071 │ {'location': 'Cell membrane', 'source': 'uniprot', ... +2}                │
│ …               │ …                                                                         │
└─────────────────┴───────────────────────────────────────────────────────────────────────────┘
  • We can take it a step further with "unpack", which we can use to turn a struct's keys into columns
In [5]:
# Flatten the nested column completely: unnest() produces one struct per row,
# then unpack() replaces the struct column with one column per struct field
# (location, source, termSL, labelSL) alongside id.
(t_targets
 .select("id", _.subcellularLocations.unnest())
 .unpack("subcellularLocations")
)
Out[5]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ location         ┃ source         ┃ termSL  ┃ labelSL            ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩
│ string          │ string           │ string         │ string  │ string             │
├─────────────────┼──────────────────┼────────────────┼─────────┼────────────────────┤
│ ENSG00000020219 │ Cytoplasm        │ uniprot        │ SL-0086 │ Cellular component │
│ ENSG00000059588 │ Nuclear speckles │ HPA_main       │ SL-0186 │ Nucleus speckle    │
│ ENSG00000070182 │ Cytoplasm        │ uniprot        │ SL-0086 │ Cellular component │
│ ENSG00000070182 │ Cytosol          │ HPA_main       │ SL-0091 │ Cytosol            │
│ ENSG00000070366 │ Nucleus          │ uniprot        │ SL-0191 │ Cellular component │
│ ENSG00000070366 │ Chromosome       │ uniprot        │ SL-0468 │ Cellular component │
│ ENSG00000070366 │ Cytoplasm        │ uniprot        │ SL-0086 │ Cellular component │
│ ENSG00000070366 │ Nucleoli         │ HPA_main       │ SL-0188 │ Nucleolus          │
│ ENSG00000070366 │ Cytosol          │ HPA_additional │ SL-0091 │ Cytosol            │
│ ENSG00000072071 │ Cell membrane    │ uniprot        │ SL-0039 │ Cellular component │
│ …               │ …                │ …              │ …       │ …                  │
└─────────────────┴──────────────────┴────────────────┴─────────┴────────────────────┘
  • Table evaluation is lazy in Ibis, allowing us to safely chain these operations together without much overhead until we call .execute()
  • These operations are mapped onto SQL. To see what it looks like, you can use .compile()
In [6]:
# Same unnest + unpack expression as the previous cell, but compiled instead
# of evaluated; str() renders the compiled query as SQL text.
str(t_targets
 .select("id", _.subcellularLocations.unnest())
 .unpack("subcellularLocations")
 # compile() produces the backend query without executing it
 .compile()
)
Out[6]:
'SELECT t0.id, struct_extract(t0."subcellularLocations", \'location\') AS location, struct_extract(t0."subcellularLocations", \'source\') AS source, struct_extract(t0."subcellularLocations", \'termSL\') AS "termSL", struct_extract(t0."subcellularLocations", \'labelSL\') AS "labelSL" \nFROM (SELECT t1.id AS id, unnest(t1."subcellularLocations") AS "subcellularLocations" \nFROM _ibis_read_parquet_9bs60q265u7upy5bflh2c30wb AS t1) AS t0'

Exploring Open Targets Molecules¶

  • These molecules are drugs that have associated targets.
  • Each molecule has an associated Simplified molecular-input line-entry system (SMILES) string
  • We can use these SMILES to draw molecules, create molecular fingerprints, and evaluate similarity between molecules
  • Many other options are worth exploring (such as retrieving Protein Data Bank (PDB) files for these molecules and using them to build embeddings)

Molecule and Mechanism of Action Tables¶

In [7]:
# Load the Open Targets "molecule" Parquet data as an Ibis table.
t_molecule = ibis.read_parquet("../../../data/open_targets/molecule/")
# Bare expression on the last line so the notebook displays a preview.
t_molecule
Out[7]:
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id            ┃ canonicalSmiles                                                                  ┃ inchiKey                    ┃ drugType       ┃ blackBoxWarning ┃ name                                ┃ yearOfFirstApproval ┃ maximumClinicalTrialPhase ┃ parentId      ┃ hasBeenWithdrawn ┃ isApproved ┃ withdrawnNotice ┃ tradeNames                                         ┃ synonyms                                                                     ┃ crossReferences                                   ┃ childChemblIds                     ┃ linkedDiseases                            ┃ linkedTargets                             ┃ description                                                                      ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string        │ string                                                                           │ string                      │ string         │ boolean         │ string                              │ int64               │ int64                     │ string        │ boolean          │ boolean    │ struct<countri… │ array<string>                                      │ array<string>                                                                │ map<string, array<string>>                        │ array<string>                      │ struct<rows: array<string>, count: int32> │ struct<rows: array<string>, count: int32> │ string                                                                           │
├───────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────┼────────────────┼─────────────────┼─────────────────────────────────────┼─────────────────────┼───────────────────────────┼───────────────┼──────────────────┼────────────┼─────────────────┼────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ CHEMBL110739  │ C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C@@H]1CC[C@@H]2C(=O)CO       │ OMFXVFTZEKFJBZ-HJTSIMOOSA-N │ Small molecule │ False           │ CORTICOSTERONE                      │                NULL │                         3 │ NULL          │ False            │ False      │ NULL            │ []                                                 │ ['11-b,21-Dihydroxypregn-3,20-dione', '17-Deoxycortisol', ... +6]            │ {'PubChem': [...], 'Wikipedia': [...], ... +2}    │ NULL                               │ {'rows': [...], 'count': 1}               │ NULL                                      │ Small molecule drug with a maximum clinical trial phase of III and has 1 invest… │
│ CHEMBL1195    │ CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC                                                   │ CAJIGINSTLKQMM-UHFFFAOYSA-N │ Small molecule │ False           │ PROPOXYCAINE                        │                1982 │                         4 │ NULL          │ False            │ True       │ NULL            │ []                                                 │ ['Propoxycaine']                                                             │ {'PubChem': [...], 'Wikipedia': [...], ... +2}    │ ['CHEMBL1769']                     │ NULL                                      │ {'rows': [...], 'count': 10}              │ Small molecule drug with a maximum clinical trial phase of IV that was first ap… │
│ CHEMBL1200632 │ CCCCCCCCCCCCCCCC(=O)O[C@@H]1[C@@H](O)[C@@H](O)[C@@H]([C@H](NC(=O)[C@@H]2C[C@@H]… │ GTNDZRUWKHDICY-DJHAJVGHSA-N │ Small molecule │ True            │ CLINDAMYCIN PALMITATE HYDROCHLORIDE │                1986 │                         4 │ CHEMBL1201289 │ False            │ True       │ NULL            │ ['Cleocin', 'Clindamycin palmitate hydrochloride'] │ ['Clindamycin palmitate hcl', 'Clindamycin palmitate hydrochloride', ... +4] │ {'DailyMed': [...], 'PubChem': [...]}             │ NULL                               │ {'rows': [...], 'count': 3}               │ {'rows': [...], 'count': 0}               │ Small molecule drug with a maximum clinical trial phase of IV (across all indic… │
│ CHEMBL1200691 │ CC(=O)[O-].CC(=O)[O-].[Mg+2]                                                     │ UEGPKNKPLBYCNK-UHFFFAOYSA-L │ Small molecule │ False           │ MAGNESIUM ACETATE                   │                NULL │                         4 │ NULL          │ False            │ True       │ NULL            │ []                                                 │ ['Acetic acid, magnesium salt', 'Magnesium acetate', ... +3]                 │ {'DailyMed': [...], 'DrugCentral': [...], ... +4} │ ['CHEMBL3989858']                  │ NULL                                      │ NULL                                      │ Small molecule drug with a maximum clinical trial phase of IV.                   │
│ CHEMBL1201042 │ CC(O)(P(=O)([O-])O)P(=O)([O-])O.[Na+].[Na+]                                      │ GWBBVOVXJZATQQ-UHFFFAOYSA-L │ Small molecule │ False           │ ETIDRONATE DISODIUM                 │                1977 │                         4 │ CHEMBL871     │ False            │ True       │ NULL            │ ['Didronel', 'Didronel iv', ... +2]                │ ['Disodium etidronate', 'Etidronate disodium', ... +3]                       │ {'PubChem': [...], 'chEBI': [...]}                │ NULL                               │ NULL                                      │ {'rows': [...], 'count': 0}               │ Small molecule drug with a maximum clinical trial phase of IV that was first ap… │
│ CHEMBL121790  │ Cc1cccc(-c2nn3c(c2-c2ccc(F)cc2)CCC3)n1                                           │ NBDZLUOYAAVYHF-UHFFFAOYSA-N │ Small molecule │ False           │ CHEMBL121790                        │                NULL │                         0 │ NULL          │ False            │ False      │ NULL            │ []                                                 │ []                                                                           │ {'drugbank': [...]}                               │ NULL                               │ NULL                                      │ NULL                                      │ Small molecule drug with a maximum clinical trial phase of I.                    │
│ CHEMBL1231    │ CCN(CC)CC#CCOC(=O)C(O)(c1ccccc1)C1CCCCC1                                         │ XIQVNETUBQGFHX-UHFFFAOYSA-N │ Small molecule │ False           │ OXYBUTYNIN                          │                1975 │                         4 │ NULL          │ False            │ True       │ NULL            │ ['Anturol', 'Contimin 2.5', ... +11]               │ ['Ditropan', 'Oxybutynin']                                                   │ {'DailyMed': [...], 'PubChem': [...], ... +3}     │ ['CHEMBL1133']                     │ {'rows': [...], 'count': 15}              │ {'rows': [...], 'count': 2}               │ Small molecule drug with a maximum clinical trial phase of IV (across all indic… │
│ CHEMBL1231592 │ CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C@@H](OS(=O)(=O)O)CC[C@]4(C)[C@H]3… │ BHYOQNUELFTYRT-DPAQBDIFSA-N │ Small molecule │ False           │ CHEMBL1231592                       │                NULL │                         0 │ NULL          │ False            │ False      │ NULL            │ []                                                 │ []                                                                           │ {'drugbank': [...], 'chEBI': [...]}               │ ['CHEMBL4475544']                  │ NULL                                      │ NULL                                      │ Small molecule drug with a maximum clinical trial phase of I.                    │
│ CHEMBL1232182 │ OCc1cccc(F)c1F                                                                   │ JSFGDUIJQWWBGY-UHFFFAOYSA-N │ Small molecule │ False           │ 2,3-Difluorobenzyl Alcohol          │                NULL │                         0 │ NULL          │ False            │ False      │ NULL            │ []                                                 │ ['2,3-Difluorobenzyl Alcohol']                                               │ {'drugbank': [...], 'chEBI': [...]}               │ NULL                               │ NULL                                      │ NULL                                      │ Small molecule drug with a maximum clinical trial phase of I.                    │
│ CHEMBL1233511 │ O=P(O)(O)O[C@H]1[C@H](OP(=O)(O)O)[C@@H](OP(=O)(O)O)[C@H](OP(=O)(O)O)[C@@H](OP(=… │ IMQLKJBTEOYOSI-GPIVLXJGSA-N │ Small molecule │ False           │ PHYTIC ACID                         │                NULL │                         3 │ NULL          │ False            │ False      │ NULL            │ []                                                 │ ['Alkalovert', 'Dermofeel pa-3', ... +9]                                     │ {'drugbank': [...], 'chEBI': [...]}               │ ['CHEMBL3989600', 'CHEMBL2106435'] │ {'rows': [...], 'count': 1}               │ NULL                                      │ Small molecule drug with a maximum clinical trial phase of III and has 1 invest… │
│ …             │ …                                                                                │ …                           │ …              │ …               │ …                                   │                   … │                         … │ …             │ …                │ …          │ …               │ …                                                  │ …                                                                            │ …                                                 │ …                                  │ …                                         │ …                                         │ …                                                                                │
└───────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────┴────────────────┴─────────────────┴─────────────────────────────────────┴─────────────────────┴───────────────────────────┴───────────────┴──────────────────┴────────────┴─────────────────┴────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────┴────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
In [8]:
# Load the Open Targets "mechanismOfAction" Parquet data as an Ibis table.
t_mechanismOfAction = ibis.read_parquet("../../../data/open_targets/mechanismOfAction/")
# Bare expression on the last line so the notebook displays a preview.
t_mechanismOfAction
Out[8]:
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ actionType                    ┃ mechanismOfAction                                           ┃ chemblIds                          ┃ targetName                                   ┃ targetType            ┃ targets                                        ┃ references                                                             ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string                        │ string                                                      │ array<string>                      │ string                                       │ string                │ array<string>                                  │ array<struct<source: string, ids: array<string>, urls: array<string>>> │
├───────────────────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────┼──────────────────────────────────────────────┼───────────────────────┼────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ ANTAGONIST                    │ Orexin receptor 2 antagonist                                │ ['CHEMBL3545367']                  │ Orexin receptor 2                            │ single protein        │ ['ENSG00000137252']                            │ [{...}]                                                                │
│ ANTAGONIST                    │ Gonadotropin-releasing hormone receptor antagonist          │ ['CHEMBL2028987', 'CHEMBL415606']  │ Gonadotropin-releasing hormone receptor      │ single protein        │ ['ENSG00000109163']                            │ [{...}]                                                                │
│ INHIBITOR                     │ Topoisomerase IV                                            │ ['CHEMBL8']                        │ Topoisomerase IV                             │ protein complex       │ []                                             │ [{...}]                                                                │
│ INHIBITOR                     │ DNA polymerase/reverse transcriptase inhibitor              │ ['CHEMBL1652128']                  │ DNA polymerase/reverse transcriptase         │ single protein        │ []                                             │ [{...}]                                                                │
│ AGONIST                       │ Insulin receptor agonist                                    │ ['CHEMBL1201664']                  │ Insulin receptor                             │ single protein        │ ['ENSG00000171105']                            │ [{...}]                                                                │
│ INHIBITOR                     │ Hepatitis A virus cellular receptor 2 inhibitor             │ ['CHEMBL4298123']                  │ Hepatitis A virus cellular receptor 2        │ single protein        │ ['ENSG00000135077']                            │ [{...}, {...}]                                                         │
│ ANTAGONIST                    │ Muscle-type nicotinic acetylcholine receptor antagonist     │ ['CHEMBL1200549', 'CHEMBL1201352'] │ Muscle-type nicotinic acetylcholine receptor │ protein complex group │ ['ENSG00000138435', 'ENSG00000196811', ... +3] │ [{...}, {...}]                                                         │
│ ANTAGONIST                    │ Endothelin receptor ET-A antagonist                         │ ['CHEMBL23261']                    │ Endothelin receptor ET-A                     │ single protein        │ ['ENSG00000151617']                            │ [{...}]                                                                │
│ INHIBITOR                     │ Poly [ADP-ribose] polymerase-1 inhibitor                    │ ['CHEMBL3137318', 'CHEMBL3137320'] │ Poly [ADP-ribose] polymerase-1               │ single protein        │ ['ENSG00000143799']                            │ [{...}]                                                                │
│ POSITIVE ALLOSTERIC MODULATOR │ GABA receptor alpha-3 subunit positive allosteric modulator │ ['CHEMBL1783256']                  │ GABA receptor alpha-3 subunit                │ single protein        │ ['ENSG00000011677']                            │ [{...}]                                                                │
│ …                             │ …                                                           │ …                                  │ …                                            │ …                     │ …                                              │ …                                                                      │
└───────────────────────────────┴─────────────────────────────────────────────────────────────┴────────────────────────────────────┴──────────────────────────────────────────────┴───────────────────────┴────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────┘

Labelling molecules with mechanism of action¶

In [9]:
# Stage 1: one (actionType, id) row per ChEMBL id listed on a mechanism.
action_per_chembl_id = (
    t_mechanismOfAction
    # actionType becomes our label, so rows without one are of no use
    .dropna(_.actionType)
    # chemblIds is an array column: unnest it and rename it to "id" so it
    # matches the key column of t_molecule
    .select(_.actionType, id=_.chemblIds.unnest())
)

# Stage 2: for this demo, keep only the first actionType seen for each drug.
# NOTE(review): no explicit ordering is applied before .first(), so which
# actionType counts as "first" for a multi-mechanism drug may not be stable.
one_label_per_id = (
    action_per_chembl_id
    .group_by(_.id)
    .agg(actionType=_.actionType.first())
)

# Stage 3: attach the molecule columns and keep only rows that have a SMILES
# string to work with.
molecules_with_labels = (
    one_label_per_id
    .inner_join(t_molecule, "id")
    .dropna("canonicalSmiles")
)
  • Note that we're considering only a subset of Open Targets molecules because of our filters/joins:
In [10]:
# .count() only builds a lazy Ibis expression. Execute it explicitly so that a
# plain integer is formatted, rather than relying on the interactive-mode repr
# of the expression (which depends on ibis.options.interactive being set).
n_before = t_molecule.count().execute()
n_after = molecules_with_labels.count().execute()
print(f"Before Filter: {n_before}")
print(f"After Filter: {n_after}")


Before Filter: 12854



After Filter: 4126


Using SMILES¶

In [11]:
# Pulling table into a Pandas Dataframe
df = (molecules_with_labels
      .select("id", "actionType", "name", "canonicalSmiles")
      .execute())

# Using RDKit to parse each SMILES string into a molecule object
# (fingerprints are derived from these molecules in a later cell).
# MolFromSmiles returns None instead of raising when it cannot parse a string,
# so those rows are dropped here; otherwise SetProp below would fail on None.
parsed = [Chem.MolFromSmiles(mol_smile) for mol_smile in df["canonicalSmiles"]]
is_valid = np.array([mol is not None for mol in parsed], dtype=bool)
if not is_valid.all():
    print("Skipping {} molecules with unparseable SMILES".format((~is_valid).sum()))

# Keep df and mols aligned row-for-row: later cells index both by position.
df = df.loc[is_valid].reset_index(drop=True)
mols = [mol for mol in parsed if mol is not None]

# Store the drug name on each molecule so it can be used as a plot legend.
for mol, mol_name in zip(mols, df["name"]):
    mol.SetProp('_Name', mol_name)
In [12]:
# Let's visualize some of these molecules
# Legends come from the '_Name' property stored on each molecule; maxMols
# is set to 20 and the grid is laid out 10 per row.
mol_names = [mol.GetProp('_Name') for mol in mols]
Draw.MolsToGridImage(
    mols,
    legends=mol_names,
    molsPerRow=10,
    subImgSize=(150, 150),
    maxMols=20,
)
Out[12]:

Creating Molecule Fingerprints and Deriving Similarity Matrix¶

Processing¶

  • We are using RDKit to create fingerprints
  • A good tutorial can be found here
  • There are plenty of other chemoinformatics tools that could be useful to explore as well!
In [13]:
# One fingerprint per molecule, in the same order as `mols` (and `df`).
fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in mols]

# Pairwise fingerprint similarity, stored as a dense square matrix.
# As the original note here pointed out, the similarity is symmetric, so each
# pair is computed once (upper triangle, diagonal included) and mirrored.
# That halves the number of Python-level similarity calls.
n_mols = len(fingerprints)
sim_matrix = np.zeros((n_mols, n_mols))
for i, f1 in enumerate(fingerprints):
    for j in range(i, n_mols):
        similarity = DataStructs.FingerprintSimilarity(f1, fingerprints[j])
        sim_matrix[i, j] = similarity
        sim_matrix[j, i] = similarity

Heatmap¶

  • We're going to visualize a small subset of the similarity matrix (40 x 40 piece)
  • You'll need to zoom in to see all the labels
  • Double-click to reset the zoom level, or click the "Reset Axis" (home button)
    • The plot looks a little squashed because of the length of the labels
In [14]:
# Top-left 40 x 40 corner of the similarity matrix: the first 40 molecules
# compared against each other.
sub_mat = sim_matrix[:40, :40]

# Axis labels are the names of those same first 40 molecules.
first_names = df["name"][:40]

# Plotly display options passed to show(): scroll zoom on, double-click resets.
config = {"scrollZoom": True, "doubleClick": "reset"}

heatmap = px.imshow(sub_mat, x=first_names, y=first_names)
heatmap.show(config=config)

Deriving Cluster Map¶

  • We can also derive a cluster map from that subset to see related drugs
In [15]:
import plotly.figure_factory as ff
In [16]:
# Leaf labels: names of the first 40 molecules, matching the rows of sub_mat.
labels = [mol.GetProp('_Name') for mol in mols[:40]]

# Plotly display options passed to show(): scroll zoom on, double-click resets.
config = {"scrollZoom": True, "doubleClick": "reset"}

# NOTE(review): create_dendrogram is handed the similarity sub-matrix as-is;
# confirm whether it should receive distances (e.g. 1 - similarity) instead.
dendro_fig = ff.create_dendrogram(sub_mat, orientation='left', labels=labels)
dendro_fig.show(config=config)

UMAP¶

  • We'll be covering umap embeddings more in the other demos
  • But for now, we can take that similarity matrix we made (the entire one, not just the subset) and embed drugs using UMAP
  • We can also assign useful mouse-over information and colors to help give insight
  • The "actionType" labels come from what we retrieved earlier. In the plot legend, click a label to hide that group, or double-click it to show only that group.
In [17]:
# Embed every molecule in 3 dimensions, using its row of the full similarity
# matrix as the input to UMAP.
reducer = umap.UMAP(n_components=3)

# Transpose so the result unpacks into one array per output coordinate.
embeddings = reducer.fit_transform(sim_matrix).T
df["x"], df["y"], df["z"] = embeddings

# Positional row number, shown on hover in the plot.
df["index"] = np.arange(len(df)).tolist()
  • Visualization
In [18]:
# 3-D scatter of the UMAP coordinates: one point per molecule, coloured by
# actionType, with the molecule name and row index shown on hover.
fig = px.scatter_3d(
    df,
    x="x",
    y="y",
    z="z",
    color="actionType",
    hover_name="name",
    hover_data=["index"],
)

# Small markers and zero margins.
fig.update_traces(marker={"size": 2})
fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})
fig.show()